{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz\n",
    "# !tar -zxf train-en-vi.tgz\n",
    "# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz\n",
    "# !tar -zxf dev-2012-en-vi.tgz\n",
    "# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz\n",
    "# !tar -zxf test-2013-en-vi.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install malaya --no-deps\n",
    "# !pip3 install bert-tensorflow\n",
    "# !pip3 install toolz\n",
    "# !pip3 install pysastrawi\n",
    "# !pip3 install fuzzywuzzy\n",
    "# !pip3 install xgboost\n",
    "# !pip3 install ftfy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/home/husein/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "import re\n",
    "\n",
    "tokenizer = malaya.preprocessing.SocialTokenizer().tokenize\n",
    "\n",
    "def is_number_regex(s):\n",
    "    if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
    "        return s.isdigit()\n",
    "    return True\n",
    "\n",
    "def preprocessing(string):\n",
    "    tokenized = tokenizer(string)\n",
    "    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]\n",
    "    return tokenized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(133317, 133317)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('train.en') as fopen:\n",
    "    train_english = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "with open('train.vi') as fopen:\n",
    "    train_vietnam = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "len(train_english), len(train_vietnam)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Rachel Pike : The science behind a climate headline',\n",
       " 'Khoa học đằng sau một tiêu đề về khí hậu')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_english[0], train_vietnam[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 133317/133317 [00:16<00:00, 8024.56it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for i in tqdm(range(len(train_english))):\n",
    "    train_english[i] = ' '.join(preprocessing(train_english[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 133317/133317 [00:21<00:00, 6136.78it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(len(train_vietnam))):\n",
    "    train_vietnam[i] = ' '.join(preprocessing(train_vietnam[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1553, 1553)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('tst2012.en') as fopen:\n",
    "    test_english_2012 = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "with open('tst2012.vi') as fopen:\n",
    "    test_vietnam_2012 = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "len(test_english_2012), len(test_vietnam_2012)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1553/1553 [00:00<00:00, 9039.32it/s]\n",
      "100%|██████████| 1553/1553 [00:00<00:00, 6949.77it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(len(test_english_2012))):\n",
    "    test_english_2012[i] = ' '.join(preprocessing(test_english_2012[i]))\n",
    "    \n",
    "for i in tqdm(range(len(test_vietnam_2012))):\n",
    "    test_vietnam_2012[i] = ' '.join(preprocessing(test_vietnam_2012[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1268, 1268)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('tst2013.en') as fopen:\n",
    "    test_english_2013 = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "with open('tst2013.vi') as fopen:\n",
    "    test_vietnam_2013 = fopen.read().split('\\n')[:-1]\n",
    "    \n",
    "len(test_english_2013), len(test_vietnam_2013)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1268/1268 [00:00<00:00, 2778.69it/s]\n",
      "100%|██████████| 1268/1268 [00:00<00:00, 2050.06it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(len(test_english_2013))):\n",
    "    test_english_2013[i] = ' '.join(preprocessing(test_english_2013[i]))\n",
    "    \n",
    "for i in tqdm(range(len(test_vietnam_2013))):\n",
    "    test_vietnam_2013[i] = ' '.join(preprocessing(test_vietnam_2013[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, train_Y = [], []\n",
    "for i in range(len(train_english)):\n",
    "    if len(train_english[i].split()) > 250:\n",
    "        continue\n",
    "    train_X.append(train_english[i])\n",
    "    train_Y.append(train_vietnam[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_X, test_Y = [], []\n",
    "for i in range(len(test_english_2012)):\n",
    "    if len(test_english_2012[i].split()) > 250:\n",
    "        continue\n",
    "    test_X.append(test_english_2012[i])\n",
    "    test_Y.append(test_vietnam_2012[i])\n",
    "    \n",
    "for i in range(len(test_english_2013)):\n",
    "    if len(test_english_2013[i].split()) > 250:\n",
    "        continue\n",
    "    test_X.append(test_english_2013[i])\n",
    "    test_Y.append(test_vietnam_2013[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import json\n",
    "\n",
    "def build_dataset(words, n_words, atleast=1):\n",
    "    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]\n",
    "    counter = collections.Counter(words).most_common(n_words)\n",
    "    counter = [i for i in counter if i[1] >= atleast]\n",
    "    count.extend(counter)\n",
    "    dictionary = dict()\n",
    "    for word, _ in count:\n",
    "        dictionary[word] = len(dictionary)\n",
    "    data = list()\n",
    "    unk_count = 0\n",
    "    for word in words:\n",
    "        index = dictionary.get(word, 0)\n",
    "        if index == 0:\n",
    "            unk_count += 1\n",
    "        data.append(index)\n",
    "    count[0][1] = unk_count\n",
    "    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
    "    return data, count, dictionary, reversed_dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 47925\n",
      "Most common words [(',', 155595), ('.', 134615), ('the', 102861), ('to', 65617), (\"'\", 63980), ('of', 60178)]\n",
      "Sample data [6514, 16802, 55, 58, 335, 591, 11, 731, 5458, 132] ['Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', 'In']\n"
     ]
    }
   ],
   "source": [
    "concat_from = ' '.join(train_X).split()\n",
    "vocabulary_size_from = len(list(set(concat_from)))\n",
    "data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from, vocabulary_size_from)\n",
    "print('vocab from size: %d'%(vocabulary_size_from))\n",
    "print('Most common words', count_from[4:10])\n",
    "print('Sample data', data_from[:10], [rev_dictionary_from[i] for i in data_from[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab to size: 22341\n",
      "Most common words [(',', 128187), ('.', 125091), ('là', 57919), ('tôi', 51677), ('một', 48925), ('có', 48134)]\n",
      "Sample data [1909, 66, 1135, 128, 8, 371, 111, 38, 411, 723] ['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']\n"
     ]
    }
   ],
   "source": [
    "concat_to = ' '.join(train_Y).split()\n",
    "vocabulary_size_to = len(list(set(concat_to)))\n",
    "data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, vocabulary_size_to)\n",
    "print('vocab to size: %d'%(vocabulary_size_to))\n",
    "print('Most common words', count_to[4:10])\n",
    "print('Sample data', data_to[:10], [rev_dictionary_to[i] for i in data_to[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train-test.json', 'w') as fopen:\n",
    "    json.dump({'train_X': train_X, 'train_Y': train_Y,\n",
    "              'test_X': test_X,\n",
    "              'test_Y': test_Y}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dictionary.json', 'w') as fopen:\n",
    "    json.dump({'from': {'dictionary': dictionary_from, 'rev_dictionary': rev_dictionary_from},\n",
    "              'to': {'dictionary': dictionary_to, 'rev_dictionary': rev_dictionary_to}}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
